In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# CLUSTERING
# Import necessary libraries
! pip install pandas numpy scikit-learn matplotlib scipy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree
from sklearn.preprocessing import StandardScaler
In [5]:
# 0. Review of principal components – another unsupervised learning method
# Load the dataset
url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/USArrests.csv"
USArrests = pd.read_csv(url, index_col=0)
# Standardize the data so each feature has mean 0 and variance 1
X_scaled = StandardScaler().fit_transform(USArrests)
features = USArrests.columns # Get column names for the features
# State names are the row labels (the CSV's first column became the index)
state_names = USArrests.index
# Perform PCA, keeping the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Plot the PCA components
plt.figure(figsize=(12, 9))
plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.5)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Biplot with State Labels and Enhanced Arrows")
# Add labels for each state
for i, state in enumerate(state_names):
    plt.text(X_pca[i, 0], X_pca[i, 1], state, ha='right', color='blue', fontsize=8)
# Plot arrows (loadings) for each feature
for i, feature in enumerate(features):
    plt.arrow(0, 0,
              pca.components_[0, i] * 2,  # adjust the factor 2 to lengthen or shorten arrows
              pca.components_[1, i] * 2,
              color='red',
              width=0.02,      # thickness of the arrow shaft
              head_width=0.1)  # width of the arrowhead
    plt.text(pca.components_[0, i] * 2.2, pca.components_[1, i] * 2.2,
             feature, color='red', ha='center', va='center')
plt.grid()
plt.show()
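In [ ]:
# A quick check of how much variance the two plotted components capture.
# Minimal sketch using the pca object fitted above; for the standardized
# USArrests data, PC1 and PC2 together explain roughly 87% of the variance.
print("Explained variance ratio:", pca.explained_variance_ratio_)
print(f"Total shown in the biplot: {pca.explained_variance_ratio_.sum():.1%}")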
In [7]:
# 1. K-means method
# K-means clustering with K=2
kmeans_2 = KMeans(n_clusters=2, n_init=20, random_state=42)  # 20 random starts for stability
clusters_2 = kmeans_2.fit_predict(X_scaled)
# Cluster means
print('Cluster means:\n', kmeans_2.cluster_centers_)
# Clustering vector
USArrests['Cluster'] = clusters_2
print(USArrests[['Murder', 'Assault', 'UrbanPop', 'Rape', 'Cluster']])
# Plot K-means clusters in the space of the first two principal components
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_2, s=100, alpha=0.5)
plt.title('K-means Clusters (K=2)')
plt.show()
# K-means with K=5
kmeans_5 = KMeans(n_clusters=5, n_init=20, random_state=42)
clusters_5 = kmeans_5.fit_predict(X_scaled)
# Plot K-means clusters in PC space for K=5
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_5, s=100, alpha=0.5)
plt.title('K-means Clusters (K=5)')
plt.show()
Cluster means:
 [[-0.67675778 -0.68274685 -0.13306084 -0.57037591]
 [ 1.01513667  1.02412028  0.19959126  0.85556386]]
                Murder  Assault  UrbanPop  Rape  Cluster
rownames
Alabama           13.2      236        58  21.2        1
Alaska            10.0      263        48  44.5        1
Arizona            8.1      294        80  31.0        1
Arkansas           8.8      190        50  19.5        0
California         9.0      276        91  40.6        1
Colorado           7.9      204        78  38.7        1
Connecticut        3.3      110        77  11.1        0
Delaware           5.9      238        72  15.8        0
Florida           15.4      335        80  31.9        1
Georgia           17.4      211        60  25.8        1
Hawaii             5.3       46        83  20.2        0
Idaho              2.6      120        54  14.2        0
Illinois          10.4      249        83  24.0        1
Indiana            7.2      113        65  21.0        0
Iowa               2.2       56        57  11.3        0
Kansas             6.0      115        66  18.0        0
Kentucky           9.7      109        52  16.3        0
Louisiana         15.4      249        66  22.2        1
Maine              2.1       83        51   7.8        0
Maryland          11.3      300        67  27.8        1
Massachusetts      4.4      149        85  16.3        0
Michigan          12.1      255        74  35.1        1
Minnesota          2.7       72        66  14.9        0
Mississippi       16.1      259        44  17.1        1
Missouri           9.0      178        70  28.2        1
Montana            6.0      109        53  16.4        0
Nebraska           4.3      102        62  16.5        0
Nevada            12.2      252        81  46.0        1
New Hampshire      2.1       57        56   9.5        0
New Jersey         7.4      159        89  18.8        0
New Mexico        11.4      285        70  32.1        1
New York          11.1      254        86  26.1        1
North Carolina    13.0      337        45  16.1        1
North Dakota       0.8       45        44   7.3        0
Ohio               7.3      120        75  21.4        0
Oklahoma           6.6      151        68  20.0        0
Oregon             4.9      159        67  29.3        0
Pennsylvania       6.3      106        72  14.9        0
Rhode Island       3.4      174        87   8.3        0
South Carolina    14.4      279        48  22.5        1
South Dakota       3.8       86        45  12.8        0
Tennessee         13.2      188        59  26.9        1
Texas             12.7      201        80  25.5        1
Utah               3.2      120        80  22.9        0
Vermont            2.2       48        32  11.2        0
Virginia           8.5      156        63  20.7        0
Washington         4.0      145        73  26.2        0
West Virginia      5.7       81        39   9.3        0
Wisconsin          2.6       53        66  10.8        0
Wyoming            6.8      161        60  15.6        0
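In [ ]:
# Choosing K with an "elbow" plot: total within-cluster sum of squares
# (sklearn's inertia_) versus K. A minimal sketch reusing X_scaled from
# above; look for the bend where adding clusters stops paying off.
wss = []
K_range = range(1, 11)
for k in K_range:
    km = KMeans(n_clusters=k, n_init=20, random_state=42).fit(X_scaled)
    wss.append(km.inertia_)  # within-cluster sum of squares
plt.plot(K_range, wss, marker='o')
plt.xlabel('Number of clusters K')
plt.ylabel('Total within-cluster SS (inertia)')
plt.title('Elbow Plot')
plt.show()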
In [9]:
# 2. Hierarchical Clustering and Dendrogram
HC = linkage(X_scaled, method='complete')
plt.figure(figsize=(10, 8))
dendrogram(HC, labels=USArrests.index)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('States')
plt.ylabel('Distance')
plt.show()
# Cutting the dendrogram to create clusters
clusters_hc = cut_tree(HC, n_clusters=5)
USArrests['HC_Cluster'] = clusters_hc.flatten()
print(USArrests[['Murder', 'Assault', 'UrbanPop', 'Rape', 'HC_Cluster']])
                Murder  Assault  UrbanPop  Rape  HC_Cluster
rownames
Alabama           13.2      236        58  21.2           0
Alaska            10.0      263        48  44.5           1
Arizona            8.1      294        80  31.0           2
Arkansas           8.8      190        50  19.5           3
California         9.0      276        91  40.6           2
Colorado           7.9      204        78  38.7           2
Connecticut        3.3      110        77  11.1           3
Delaware           5.9      238        72  15.8           3
Florida           15.4      335        80  31.9           2
Georgia           17.4      211        60  25.8           0
Hawaii             5.3       46        83  20.2           3
Idaho              2.6      120        54  14.2           4
Illinois          10.4      249        83  24.0           2
Indiana            7.2      113        65  21.0           3
Iowa               2.2       56        57  11.3           4
Kansas             6.0      115        66  18.0           3
Kentucky           9.7      109        52  16.3           3
Louisiana         15.4      249        66  22.2           0
Maine              2.1       83        51   7.8           4
Maryland          11.3      300        67  27.8           2
Massachusetts      4.4      149        85  16.3           3
Michigan          12.1      255        74  35.1           2
Minnesota          2.7       72        66  14.9           3
Mississippi       16.1      259        44  17.1           0
Missouri           9.0      178        70  28.2           3
Montana            6.0      109        53  16.4           4
Nebraska           4.3      102        62  16.5           4
Nevada            12.2      252        81  46.0           2
New Hampshire      2.1       57        56   9.5           4
New Jersey         7.4      159        89  18.8           3
New Mexico        11.4      285        70  32.1           2
New York          11.1      254        86  26.1           2
North Carolina    13.0      337        45  16.1           0
North Dakota       0.8       45        44   7.3           4
Ohio               7.3      120        75  21.4           3
Oklahoma           6.6      151        68  20.0           3
Oregon             4.9      159        67  29.3           3
Pennsylvania       6.3      106        72  14.9           3
Rhode Island       3.4      174        87   8.3           3
South Carolina    14.4      279        48  22.5           0
South Dakota       3.8       86        45  12.8           4
Tennessee         13.2      188        59  26.9           0
Texas             12.7      201        80  25.5           2
Utah               3.2      120        80  22.9           3
Vermont            2.2       48        32  11.2           4
Virginia           8.5      156        63  20.7           3
Washington         4.0      145        73  26.2           3
West Virginia      5.7       81        39   9.3           4
Wisconsin          2.6       53        66  10.8           3
Wyoming            6.8      161        60  15.6           3
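In [ ]:
# Compare the two clusterings with a contingency table. Cluster labels are
# arbitrary, so only the pattern of overlap between the K=5 K-means labels
# and the five hierarchical clusters matters. A minimal sketch:
print(pd.crosstab(clusters_5, USArrests['HC_Cluster'],
                  rownames=['KMeans_K5'], colnames=['HC']))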
In [ ]:
# 3. College data - K-means method
! pip install ISLP
from ISLP import load_data
# Load the College dataset from the ISLP package
College = load_data('College')
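In [ ]:
# A quick look before clustering. College should have 777 rows and 18 columns,
# of which only Private (Yes/No) is non-numeric -- hence the select_dtypes
# call in the next cell. A minimal inspection sketch:
print(College.shape)
print(College.dtypes)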
In [17]:
# Create a matrix of numeric variables
X_college = College.select_dtypes(include=[np.number])
print(X_college.shape) # Check dimensions
# K-means with K=5 for college data
kmeans_college = KMeans(n_clusters=5, random_state=42)
clusters_college = kmeans_college.fit_predict(X_college)
# Cluster means
print('Cluster means for College:\n', kmeans_college.cluster_centers_)
# Clustering vector
College['Cluster'] = clusters_college
print(College[['Cluster']].head())
# Plot pairs of variables with assigned clusters
plt.figure(figsize=(12, 12))
plt.subplot(2, 2, 1)
plt.scatter(College['Outstate'], College['Top10perc'], c=clusters_college)
plt.title('Outstate vs Top10perc')
plt.subplot(2, 2, 2)
plt.scatter(College['S.F.Ratio'], College['PhD'], c=clusters_college)
plt.title('S.F.Ratio vs PhD')
plt.subplot(2, 2, 3)
plt.scatter(College['Apps'], College['Enroll'], c=clusters_college)
plt.title('Apps vs Enroll')
plt.subplot(2, 2, 4)
plt.scatter(College['Room.Board'], College['Private'], c=clusters_college)  # Private is categorical (Yes/No)
plt.title('Room.Board vs Private')
plt.tight_layout()
plt.show()
(777, 17)
Cluster means for College:
 [[1.18157500e+03 8.98534091e+02 3.55340909e+02 2.12704545e+01 4.85340909e+01
  1.49989773e+03 4.97884091e+02 9.18096364e+03 4.10183182e+03 5.29795455e+02
  1.31207273e+03 6.50954545e+01 7.27500000e+01 1.44000000e+01 2.11704545e+01
  7.63585227e+03 6.27045455e+01]
 [1.50671143e+04 1.00431714e+04 3.84400000e+03 4.15428571e+01 7.40285714e+01
  1.96352571e+04 3.57811429e+03 9.46885714e+03 4.57285714e+03 6.03000000e+02
  1.75028571e+03 8.67142857e+01 9.19428571e+01 1.57485714e+01 1.76000000e+01
  1.11875143e+04 6.57428571e+01]
 [2.62470732e+03 1.72104268e+03 5.27079268e+02 4.00487805e+01 6.91463415e+01
  2.09468902e+03 2.83243902e+02 1.57379268e+04 5.25655488e+03 5.78481707e+02
  1.04590854e+03 8.34207317e+01 9.02682927e+01 1.14146341e+01 3.27012195e+01
  1.37479878e+04 7.68414634e+01]
 [5.78270000e+03 3.98210000e+03 1.71863333e+03 2.22333333e+01 5.34583333e+01
  9.13747500e+03 2.23692500e+03 6.96022500e+03 3.76000000e+03 5.61841667e+02
  1.73805833e+03 7.84916667e+01 8.46583333e+01 1.72633333e+01 1.43833333e+01
  7.45545000e+03 5.63416667e+01]
 [8.92527778e+03 3.42455556e+03 1.24844444e+03 7.57777778e+01 9.18333333e+01
  4.86738889e+03 2.98944444e+02 1.80621667e+04 5.98150000e+03 5.75666667e+02
  1.27838889e+03 9.33333333e+01 9.65555556e+01 6.49444444e+00 3.62222222e+01
  3.36272778e+04 8.95000000e+01]]
   Cluster
0        0
1        2
2        0
3        2
4        0
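In [ ]:
# The College variables sit on very different scales (Apps in the thousands,
# percentages under 100), so the unscaled K-means above is dominated by the
# largest-variance columns. A minimal sketch of re-running on standardized
# data and tabulating how the assignments change:
X_college_scaled = StandardScaler().fit_transform(X_college)
kmeans_scaled = KMeans(n_clusters=5, n_init=20, random_state=42)
clusters_scaled = kmeans_scaled.fit_predict(X_college_scaled)
print(pd.crosstab(clusters_college, clusters_scaled,
                  rownames=['Unscaled'], colnames=['Scaled']))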
In [19]:
# 4. College data - Hierarchical Clustering
HC_college = linkage(X_college, method='complete')
plt.figure(figsize=(10, 8))
dendrogram(HC_college, labels=College.index)
plt.title('Hierarchical Clustering Dendrogram (College)')
plt.xlabel('Colleges')
plt.ylabel('Distance')
plt.show()
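# With 777 leaves the full dendrogram above is unreadable; scipy can truncate
# it to show only the last p merges (a sketch using scipy's truncate_mode):
plt.figure(figsize=(10, 6))
dendrogram(HC_college, truncate_mode='lastp', p=30)
plt.title('Truncated Dendrogram (last 30 merges)')
plt.show()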
# Random sample for hierarchical clustering
Z = np.random.choice(College.index, 20, replace=False)
Y = X_college.loc[Z]
HCZ = linkage(Y, method='complete')
plt.figure(figsize=(10, 8))
dendrogram(HCZ, labels=Y.index)
plt.title('Hierarchical Clustering Dendrogram (Sample Colleges)')
plt.xlabel('Colleges')
plt.ylabel('Distance')
plt.show()
# Create clusters from the hierarchical clustering
HC4 = cut_tree(HC_college, n_clusters=4)
College['HC_Cluster'] = HC4.flatten()
print(College[['HC_Cluster']].head())
   HC_Cluster
0           0
1           0
2           0
3           0
4           0
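In [ ]:
# A numeric way to pick the number of clusters: average silhouette score
# (closer to 1 means better-separated clusters). Sketch assumes the scaled
# matrix X_college_scaled from the K-means section above.
from sklearn.metrics import silhouette_score
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=20, random_state=42).fit_predict(X_college_scaled)
    print(f'K={k}: silhouette = {silhouette_score(X_college_scaled, labels):.3f}')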
In [ ]: